Import Statements¶

In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import plotly.express as px
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import country_converter as coco
import chardet

cc = coco.CountryConverter()

Notebook Presentation¶

In [2]:
pd.options.display.float_format = '{:,.3f}'.format
pd.options.mode.chained_assignment = None

Load the Data¶

The first column in the .csv file just has the row numbers, so it will be used as the index.

In [3]:
# Data source - https://www.kaggle.com/datasets/mayankanand2701/global-covid-19-dataset/data

with open('Global COVID-19 Dataset.csv', 'rb') as f:
    result = chardet.detect(f.read())

df = pd.read_csv('Global COVID-19 Dataset.csv', encoding=result['encoding'], index_col=0)
result
Out[3]:
{'encoding': 'ISO-8859-1', 'confidence': 0.73, 'language': ''}

Preliminary Data Exploration¶

In [4]:
df.sample(4)
Out[4]:
Country Name Cases Deaths Recovered
S. No.
163.000 Qatar 514,524.000 690.000 513,834.000
32.000 Russia 24,158,502.000 402,821.000 23,755,681.000
2.000 Bulgaria 1,329,266.000 38,700.000 1,290,566.000
45.000 Guam 52,287.000 419.000 51,868.000
In [5]:
df.shape
Out[5]:
(238, 4)
In [6]:
df.columns
Out[6]:
Index(['Country Name', 'Cases', 'Deaths', 'Recovered'], dtype='object')
In [7]:
df.info()
<class 'pandas.core.frame.DataFrame'>
Index: 238 entries, 1.0 to nan
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Country Name  230 non-null    object 
 1   Cases         230 non-null    float64
 2   Deaths        230 non-null    float64
 3   Recovered     230 non-null    float64
dtypes: float64(3), object(1)
memory usage: 9.3+ KB
In [8]:
df.tail(10)
Out[8]:
Country Name Cases Deaths Recovered
S. No.
229.000 Chad 7,702.000 194.000 7,508.000
230.000 Burundi 54,569.000 15.000 54,554.000
NaN NaN NaN NaN NaN
NaN NaN NaN NaN NaN
NaN NaN NaN NaN NaN
NaN NaN NaN NaN NaN
NaN NaN NaN NaN NaN
NaN NaN NaN NaN NaN
NaN NaN NaN NaN NaN
NaN NaN NaN NaN NaN

Data Cleaning¶

In [9]:
df.isna().values.any()
Out[9]:
True
In [10]:
df.isna().values.sum()
Out[10]:
32
In [11]:
df.isna().values
col_subset = ['Country Name', 'Cases', 'Deaths', 'Recovered']
df.loc[df.Cases.isna()][col_subset]
Out[11]:
Country Name Cases Deaths Recovered
S. No.
NaN NaN NaN NaN NaN
NaN NaN NaN NaN NaN
NaN NaN NaN NaN NaN
NaN NaN NaN NaN NaN
NaN NaN NaN NaN NaN
NaN NaN NaN NaN NaN
NaN NaN NaN NaN NaN
NaN NaN NaN NaN NaN
In [12]:
clean_df = df.dropna()
In [13]:
clean_df.duplicated(keep=False).values.any()
Out[13]:
False

Characters cleaning: convert the data in the Cases, Deaths, Recovered columns and remove all the non-numeric characters¶

In [14]:
columns_to_clean = ['Cases', 'Deaths', 'Recovered']

for col in columns_to_clean:
    clean_df[col] = clean_df[col].astype(str).str.replace(',', "")
    # Convert column to a numeric data type
    clean_df[col] = pd.to_numeric(clean_df[col]).astype(int)

clean_df
Out[14]:
Country Name Cases Deaths Recovered
S. No.
1.000 Peru 4524748 220831 4303917
2.000 Bulgaria 1329266 38700 1290566
3.000 Bosnia and Herzegovina 403638 16388 387250
4.000 Hungary 2230381 49051 2181330
5.000 North Macedonia 350589 9977 340612
... ... ... ... ...
226.000 Tajikistan 17786 125 17661
227.000 Benin 28036 163 27873
228.000 Niger 9515 315 9200
229.000 Chad 7702 194 7508
230.000 Burundi 54569 15 54554

230 rows × 4 columns

Descriptive Statistics and Additional Calculations¶

In [15]:
clean_df["Death_Rate"] = (clean_df["Deaths"] / clean_df["Cases"]).round(3)
In [16]:
newdf = clean_df
In [17]:
newdf.describe()
Out[17]:
Cases Deaths Recovered Death_Rate
count 230.000 230.000 230.000 230.000
mean 4,234,399.339 36,262.422 4,198,136.917 0.013
std 16,639,054.089 134,860.421 16,523,705.023 0.016
min 1,403.000 1.000 1,395.000 0.000
25% 30,557.000 190.250 30,240.250 0.004
50% 224,696.000 2,147.500 222,791.500 0.009
75% 1,363,680.250 16,367.250 1,353,812.500 0.017
max 185,636,059.000 1,261,370.000 184,374,689.000 0.181
  • Std is greater than the mean df, which may indicate large differences between values and non-normal distribution of the data.
  • This distribution is generally expected for a given df.
In [18]:
print(f'Country with the maximum number of cases: {newdf.sort_values(by=["Cases"], ascending=False, ignore_index=True).loc[0, "Country Name"]}')
print(f'Country with the maximum number of deaths: {newdf.sort_values(by=["Deaths"], ascending=False, ignore_index=True).loc[0, "Country Name"]}')
print(f'Country with the maximum number of recovered people: {newdf.sort_values(by=["Recovered"], ascending=False, ignore_index=True).loc[0, "Country Name"]}')
Country with the maximum number of cases: European Union
Country with the maximum number of deaths: European Union
Country with the maximum number of recovered people: European Union
In [19]:
newdf[newdf["Country Name"] == "European Union"]
Out[19]:
Country Name Cases Deaths Recovered Death_Rate
S. No.
31.000 European Union 185636059 1261370 184374689 0.007
In [20]:
# We interested in data about individual countries, so let's drop data about EU

newdf.drop(index=31.000, axis=0, inplace=True)
newdf.sort_values(by=["Cases"], ascending=False)
Out[20]:
Country Name Cases Deaths Recovered Death_Rate
S. No.
17.000 United States 103436829 1186984 102249845 0.011
194.000 China 99354727 122196 99232531 0.001
153.000 India 45037776 533589 44504187 0.012
39.000 France 38997490 168091 38829399 0.004
54.000 Germany 38437756 174979 38262777 0.005
... ... ... ... ... ...
121.000 Wallis and Futuna 3760 9 3751 0.002
173.000 Macau 3514 121 3393 0.034
159.000 Saint Pierre and Miquelon 3426 2 3424 0.001
193.000 Tuvalu 2943 1 2942 0.000
68.000 Montserrat 1403 8 1395 0.006

229 rows × 5 columns

In [21]:
# add ISO-3 country codes
newdf["ISO"] = newdf["Country Name"].apply(lambda x: coco.convert(names=x, to='ISO3', not_found=None))
df = newdf
df
Out[21]:
Country Name Cases Deaths Recovered Death_Rate ISO
S. No.
1.000 Peru 4524748 220831 4303917 0.049 PER
2.000 Bulgaria 1329266 38700 1290566 0.029 BGR
3.000 Bosnia and Herzegovina 403638 16388 387250 0.041 BIH
4.000 Hungary 2230381 49051 2181330 0.022 HUN
5.000 North Macedonia 350589 9977 340612 0.028 MKD
... ... ... ... ... ... ...
226.000 Tajikistan 17786 125 17661 0.007 TJK
227.000 Benin 28036 163 27873 0.006 BEN
228.000 Niger 9515 315 9200 0.033 NER
229.000 Chad 7702 194 7508 0.025 TCD
230.000 Burundi 54569 15 54554 0.000 BDI

229 rows × 6 columns

Visualise the Features¶

Number of Cases vs Deaths¶

In [22]:
fig = px.scatter(df, x='Cases', y='Deaths', color='Deaths', size='Deaths', hover_name='Country Name', 
                 color_continuous_scale=px.colors.sequential.Bluered)

fig.update_layout(xaxis_title="Number of Cases", yaxis_title="Number of Deaths", height=500, 
                  xaxis=dict(range=[0, 120000000]), yaxis=dict(range=[0, 1300000]))

fig.show()

According to the data set, the maximum number of Covid-19 cases (about 100 million) was observed in 2 countries - the USA and China. Despite the large number of infected people in these countries, the number of deaths varies sharply. Unlike the United States with more than 1 million deaths, in China about 120 thousand deaths have been registered, which is 9,7 times less than in the United States.

India is in 3rd place with 45 million Covid-19 cases, followed in descending order by France, Germany, Brazil, South Korea and Japan.

Case-fatality ratio in the top 10 countries (by Number of Cases) 🔺¶

In [23]:
top10 = df.sort_values(by=["Cases"], ascending=False).iloc[:10]
top10['Death_Rate_Percent'] = top10['Death_Rate'] * 100
top10.head(10)
Out[23]:
Country Name Cases Deaths Recovered Death_Rate ISO Death_Rate_Percent
S. No.
17.000 United States 103436829 1186984 102249845 0.011 USA 1.100
194.000 China 99354727 122196 99232531 0.001 CHN 0.100
153.000 India 45037776 533589 44504187 0.012 IND 1.200
39.000 France 38997490 168091 38829399 0.004 FRA 0.400
54.000 Germany 38437756 174979 38262777 0.005 DEU 0.500
22.000 Brazil 37519960 702116 36817844 0.019 BRA 1.900
130.000 South Korea 34571873 35934 34535939 0.001 KOR 0.100
134.000 Japan 33803572 74694 33728878 0.002 JPN 0.200
21.000 Italy 26721656 197001 26524655 0.007 ITA 0.700
20.000 United Kingdom 24924986 232112 24692874 0.009 GBR 0.900
In [24]:
plt.figure(figsize=(9,4))
plt.xticks(fontsize=12, rotation=45)
plt.yticks(fontsize=12)

ax1 = plt.gca() # get current axes
ax2 = ax1.twinx() 


ax1.bar(top10['Country Name'], top10['Cases'], color='grey')

ax2.plot(top10['Country Name'], top10['Death_Rate_Percent'], color='red', linewidth=2, marker='o')  
ax1.set_ylabel('Nr of Covid-19 Cases', fontsize=12, color='Grey')
ax2.set_ylabel('Case-fatality ratio, %', fontsize=12, color='Red')
Out[24]:
Text(0, 0.5, 'Case-fatality ratio, %')
No description has been provided for this image

The case fatality rate in the top 10 countries by number of Covid-19 cases does not exceed 2%, with the highest values ​​in Brazil and India (1.9% and 1.2%, respectively).

Next question is - Which countries had the highest case-fatality ratio?🔻

Top 20 countries with the most observed case-fatality ratio¶

In [25]:
top20 = df.sort_values(by=["Death_Rate"], ascending=False).iloc[:20]
top20['Death_Rate_Percent'] = top20['Death_Rate'] * 100
In [26]:
# bar chart
fig_bar = px.bar(top20, x='Country Name', y='Death_Rate_Percent', color="Death_Rate_Percent", 
                 text=top20['Death_Rate_Percent'].round(1),
                 labels={'Country Name':'', "Death_Rate_Percent":'Case-Fatality Ratio, %'},
                 color_continuous_scale=px.colors.diverging.Portland)
fig_bar.update_layout(height=400)

# choropleth map
fig_map = px.choropleth(top20, locations="ISO", color="Death_Rate_Percent", 
                        hover_name='Country Name', hover_data="Death_Rate_Percent",
                        projection='natural earth', 
                        color_continuous_scale=px.colors.diverging.Portland)

# Show the figures side by side using HTML divs
from IPython.display import display, HTML
display(HTML('<div>' + fig_bar.to_html(full_html=False) + '<div>' + fig_map.to_html(full_html=False) +'</div>' + '</div>'))

The highest case-fatality ratio reaches 18% in Yemen and exceeds the 2% threshold for most other countries in the Global South.

Number of Covid-19 cases on the World Map¶

In [27]:
fig = px.choropleth(df, locations="ISO",
                    color="Cases",
                    hover_name = 'Country Name',
                    hover_data = "Cases",
                    projection = 'natural earth',
                    color_continuous_scale=px.colors.sequential.Sunsetdark,
                    title = 'Number of Covid-19 cases',
                    width = 800,
                    height = 550,
                    )

fig.show()

Covid-19 case-fatality ratio on the World Map¶

In [28]:
fig = px.choropleth(df, locations="ISO",
                    color="Death_Rate",
                    hover_name = 'Country Name',
                    hover_data = "Death_Rate",
                    projection = 'natural earth',
                    color_continuous_scale=px.colors.sequential.Redor,
                    title = 'Covid-19 Death Rate',
                    width = 800,
                    height = 550,
                    )

fig.show()